# -*- codeing =utf-8 -*-
# @Time : 2020/12/5 20:44
# @Author : 三千xc
# @File : maoYanSpider.py
# @Software: PyCharm


import sqlite3
# 获取网页数据
import requests
# 正则表达式
import re
# 网页解析，获取数据
from bs4 import BeautifulSoup
# 保存为excel
import xlwt

#正则表达式规则 *表示多个字符，？表示0个到多个
#影片排名链接的规则
findIndex = re.compile(r'board-index.*?>(\d+).*?')
#影片图片的链接
findImage = re.compile(r'class="board-img".*?src="(.*?)"')
#影片片名
findTitle = re.compile(r'title="(.*?)">')
#影片主演
findActor = re.compile(r'class="star">(.|\n)(.*)')
#影片上映时间
findTime = re.compile(r'class="releasetime">(.*?)</p> ')
#猫眼评分
findScore1 = re.compile(r'class="integer">(.*?)</i>')
findScore2 = re.compile(r'class="fraction">(.*?)</i>')


# 爬取网页
# 解析数据
# 保存数据

def main():
    baseurl = "https://maoyan.com/board/4?offset="
    #1.爬取网页
    datalist = getData(baseurl)
    #3.保存数据
    # savepath = "猫眼TOP100.xls"
    dbpath = "movie2.db"
    # saveData(datalist, savepath)
    saveData2DB(datalist,dbpath)


def getData(baseUrl):
    datalist = []
    for i in range(0, 10):
        url = baseUrl + str(i * 10)
        html = askUrl(url)
        # 解析数据
        soup = BeautifulSoup(html, "html.parser")

        for item in soup.find_all("dd"):
            # 测试
            # print(item)
            data = []
            item = str(item)
            # 排名
            index = re.findall(findIndex, item)[0]
            data.append(index)
            # 图片地址
            image = re.findall(findImage, item)[0]
            data.append(image)
            # 标题
            title = re.findall(findTitle, item)[0]
            data.append(title)
            # 作者
            actor = re.findall(findActor, item)[0]
            actorList = list(actor)
            for i in actorList:
                actorNew = "".join(i).strip()
            data.append(actorNew)
            # 上映时间
            time = re.findall(findTime, item)[0]
            data.append(time)
            # 分数
            score1 = re.findall(findScore1, item)[0]
            # data.append(score1)
            score2 = re.findall(findScore2, item)[0]
            # data.append(score2)
            score = score1 + score2
            data.append(score)

            # print(data)
            datalist.append(data)
    #print(datalist)
    return datalist


# 爬取网页
def askUrl(url):
    #模拟浏览器头部消息，向猫眼浏览器发送消息
    headers = {
         "Accept":"* / *",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
         "Cookie":"BIDUPSID=E53D8374080760C7A143C4FD57FD9FC0; PSTM=1578902890; HMACCOUNT=6B314F8B0EFA06B7; BAIDUID=600AD9DA4AECD6C574A62250C5383D41:FG=1; BDRCVFR[Fz5t0nreVcc]=mk3SLVN4HKm; delPer=0; PSINO=6; H_PS_PSSID=; BA_HECTOR=ah0101ak2ka48k2hb31fu57qk0r; HMVT=6bcd52f51e9b3dce32bec4a3997715ac|1608687465|"}
    html = ""
    try:
        response = requests.get(url, headers=headers)
        html = response.content.decode("utf-8")
        # print(html)
    except requests.exceptions as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html

# 保存数据到excel中
def saveData(datalist, savepath):
    # 创建book对象
    book = xlwt.Workbook(encoding="utf-8")
    # 创建工作表
    sheet = book.add_sheet("猫眼TOP100", cell_overwrite_ok=True)
    col = ("电影排名", "图片地址", "电影名称", "演出人员", "上映时间", "电影评分")
    for i in range(0, 6):
        sheet.write(0, i, col[i])
        #col[i]代表列名
    for i in range(0, 100):
        print("第%d条" % (i + 1))
        try:
            data = datalist[i]
        except:
            continue
        for j in range(0, 6):
            #向表中写入数据
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


# 保存数据到数据库中
def saveData2DB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()

    for data in datalist:
        for index in range(len(data)):
            if index == 4:
                data[index] = '"' + str(data[index]) + '"'
                continue
            data[index] = '"' + data[index] + '"'
        sql = '''
                        insert into movie100(
                        movie_rank,image_link,name,actor,time,score
                        )
                        values(%s)''' % ",".join(data)
        # values(%s)'''%",". join('%s' %a for a in data)
        print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


def init_db(dbpath):
    # 创建数据表
    sql = """
            create table movie100
            (
                movie_rank integer ,
                image_link text ,
                name varchar,
                actor varchar,
                time text,
                score numeric


            )
            """

    conn = sqlite3.connect(dbpath)
    # 获取游标
    cursor = conn.cursor()
    # 执行sql语句
    cursor.execute(sql)
    conn.commit()
    # 关闭数据库
    conn.close()


if __name__ == '__main__':  # 当程序执行时调用函数
    main()
    # init_db("movietest.db")
    print('爬取完成')